import numpy as np
import matplotlib.pyplot as plt
class LinearRegression:
    """Univariate linear regression (y = m*x + b) fitted with batch gradient descent."""

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        # Step size of each gradient update and number of descent iterations.
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    def fit(self, X, y):
        """Fit the model to a 1-D feature X and target y.

        Returns:
            (m, b, costs): learned slope, intercept, and the per-iteration
            cost history (useful for convergence plots).
        """
        X = np.array(X).reshape(-1, 1)
        y = np.array(y).reshape(-1, 1)
        # (Removed an unused local `n = len(X)`; the gradient helper
        # recomputes the sample count itself.)
        m, b = self._initialize_parameters()
        costs = []
        for _ in range(self.n_iterations):
            y_pred = self._predict(X, m, b)
            dm, db = self._compute_gradients(X, y, y_pred)
            m, b = self._update_parameters(m, b, dm, db)
            # Cost is evaluated at the pre-update prediction, which is
            # sufficient for monitoring convergence.
            cost = self._compute_cost(y, y_pred)
            costs.append(cost)
        return m, b, costs

    def _initialize_parameters(self):
        # Start from zero slope and zero intercept.
        return 0, 0

    def _predict(self, X, m, b):
        # Vectorized prediction: y_hat = m*x + b.
        return X * m + b

    def _compute_gradients(self, X, y, y_pred):
        # Gradients of the squared-error cost w.r.t. slope (dm) and intercept (db).
        n = len(X)
        dm = (-2 / n) * np.sum(X * (y - y_pred))
        db = (-2 / n) * np.sum(y - y_pred)
        return dm, db

    def _update_parameters(self, m, b, dm, db):
        # One gradient-descent step.
        m -= self.learning_rate * dm
        b -= self.learning_rate * db
        return m, b

    def _compute_cost(self, y, y_pred):
        # Mean squared error with the conventional 1/2 factor.
        n = len(y)
        cost = (1 / (2 * n)) * np.sum((y - y_pred) ** 2)
        return cost

    def plot(self, X, y, m, b, costs):
        """Plot the fitted line over the data and the cost-vs-iteration curve."""
        X = np.array(X).reshape(-1, 1)
        y_pred = self._predict(X, m, b)
        plt.figure(figsize=(14, 7))
        # Scatter plot of data points with regression line
        plt.subplot(1, 2, 1)
        plt.scatter(X, y, color='blue', label='Data points', edgecolor='k', s=70, linewidth=1, alpha=0.9)
        plt.plot(X, y_pred, color='skyblue', label='Regression line', linewidth=2)
        plt.xlabel("X (Feature)", fontsize=12)
        plt.ylabel("y (Outcome)", fontsize=12)
        plt.title("Linear Regression with Gradient Descent", fontsize=14)
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.6)
        # Plot the cost history with annotations
        plt.subplot(1, 2, 2)
        plt.plot(range(1, len(costs) + 1), costs, color='green', linewidth=2, marker='o', markersize=5, alpha=0.9)
        plt.xlabel("Iteration", fontsize=12)
        plt.ylabel("Cost", fontsize=12)
        plt.title("Cost vs Iteration", fontsize=14)
        # Annotate minimum cost
        min_cost_index = np.argmin(costs)
        plt.annotate(f"Min Cost: {costs[min_cost_index]:.5f}",
                     xy=(min_cost_index + 1, costs[min_cost_index]),
                     xytext=(min_cost_index + 50, costs[min_cost_index] + 0.5),
                     arrowprops=dict(facecolor='black', shrink=0.05),
                     fontsize=12, color='black')
        plt.grid(True, linestyle='--', alpha=0.6)
        plt.tight_layout()
        plt.show()
# Sample training data.
X = np.array([1, 2, 3, 4, 5])
y = np.array([1, 2, 1.3, 3.75, 2.25])

# Train a gradient-descent linear regressor on the data.
model = LinearRegression()
m, b, costs = model.fit(X, y)

# Visualize the fitted line and the convergence of the cost.
model.plot(X, y, m, b, costs)
import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
df = pd.read_csv(r"C:\Users\acer\Downloads\cardata.csv")
report = ProfileReport(df).to_notebook_iframe()
report
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
# Per-column flag: does the column contain at least one missing value?
df.isnull().any()
Car_Name True Year True Selling_Price True Present_Price True Kms_Driven True Fuel_Type False Seller_Type False Transmission True Owner False dtype: bool
# Per-column count of missing values, to size the imputation work below.
df.isnull().sum()
Car_Name 1 Year 1 Selling_Price 4 Present_Price 1 Kms_Driven 2 Fuel_Type 0 Seller_Type 0 Transmission 2 Owner 0 dtype: int64
# Split columns by dtype so each group gets an appropriate imputation strategy.
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Impute missing numerical values with the column median (robust to outliers).
for col in numerical_cols:
    if df[col].isnull().any():
        median_value = df[col].median()
        # Assign back instead of fillna(..., inplace=True): in-place fillna on
        # a column selection is chained assignment, deprecated in modern pandas.
        df[col] = df[col].fillna(median_value)
        print(f'\nFilled missing values in {col} with median: {median_value}')

# Impute missing categorical values with the most frequent value (mode).
for col in categorical_cols:
    if df[col].isnull().any():
        mode_value = df[col].mode()[0]
        df[col] = df[col].fillna(mode_value)
        print(f'\nFilled missing values in {col} with mode: {mode_value}')
Filled missing values in Year with median: 2014.0 Filled missing values in Selling_Price with median: 3.6 Filled missing values in Present_Price with median: 6.445 Filled missing values in Kms_Driven with median: 32000.0 Filled missing values in Car_Name with mode: city Filled missing values in Transmission with mode: Manual
# Verify that no missing values remain after the imputation above.
df.isnull().any()
Car_Name False Year False Selling_Price False Present_Price False Kms_Driven False Fuel_Type False Seller_Type False Transmission False Owner False dtype: bool
# Does the frame contain any fully duplicated rows?
df.duplicated().any()
True
# Show only the later occurrence(s) of each duplicated row (first copy kept).
df[df.duplicated()]
| Car_Name | Year | Selling_Price | Present_Price | Kms_Driven | Fuel_Type | Seller_Type | Transmission | Owner | |
|---|---|---|---|---|---|---|---|---|---|
| 93 | 80 | 2015.0 | 13.65 | 22.95 | 40000.0 | 1 | 0 | 0 | 0 |
# Show every member of each duplicate group (keep=False flags all copies).
df[df.duplicated(keep=False)]
| Car_Name | Year | Selling_Price | Present_Price | Kms_Driven | Fuel_Type | Seller_Type | Transmission | Owner | |
|---|---|---|---|---|---|---|---|---|---|
| 51 | 80 | 2015.0 | 13.65 | 22.95 | 40000.0 | 1 | 0 | 0 | 0 |
| 93 | 80 | 2015.0 | 13.65 | 22.95 | 40000.0 | 1 | 0 | 0 | 0 |
# Work on a copy so the original frame keeps its duplicate rows for comparison.
df1 = df.copy()
df1.drop_duplicates(inplace=True)
# "\033[91m" switches terminal output to red. (Typo fix: "Befor" -> "Before".)
print(f"\033[91m Before drop duplicate data: rows= {df.shape[0]} | columns= {df.shape[1]}")
Befor drop duplicate data: rows= 301 | columns= 9
# "\033[92m" switches terminal output to green.
print(f"\033[92m After drop duplicate data: rows= {df1.shape[0]} | columns= {df1.shape[1]}")
After drop duplicate data: rows= 300 | columns= 9
def calculate_iqr(df):
    """Return (Q1, Q3, IQR) Series computed over the numeric columns of df."""
    first_quartile = df.quantile(0.25, numeric_only=True)
    third_quartile = df.quantile(0.75, numeric_only=True)
    return first_quartile, third_quartile, third_quartile - first_quartile
def outlier_detect(df, col, Q1, Q3, IQR):
    """Return the rows of df whose `col` value lies outside the 1.5*IQR fences."""
    lower_fence = Q1[col] - 1.5 * IQR[col]
    upper_fence = Q3[col] + 1.5 * IQR[col]
    outside = (df[col] < lower_fence) | (df[col] > upper_fence)
    return df[outside]
def outlier_detect_normal(df, col):
    """Return the rows of df whose `col` value is more than 3 sample
    standard deviations away from the column mean (Z-score rule)."""
    z_scores = (df[col] - df[col].mean()) / df[col].std()
    return df[z_scores.abs() > 3]
def replace_outliers(df, col, Q1, Q3, IQR):
    """Clip `col` in place to the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    BUG FIX: the original had the identifier ``IQR`` broken across two lines
    (``1.5 * I`` / ``QR[col]``), which raises a SyntaxError/NameError.
    """
    lower_bound = Q1[col] - 1.5 * IQR[col]
    upper_bound = Q3[col] + 1.5 * IQR[col]
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)
    print(f'Outliers replaced with bounds for column: {col}')
def process_dataset(df):
    """Report IQR and Z-score outlier counts per numeric column, then clip
    the IQR outliers to the Tukey fences. Mutates df in place and returns it.
    """
    continuous_cols = df.select_dtypes(include=np.number).columns.tolist()
    Q1, Q3, IQR = calculate_iqr(df)
    outlier_cols = []
    for col in continuous_cols:
        # Compute the IQR outliers once per column (the original called
        # outlier_detect twice with identical arguments for the same result).
        iqr_outliers = outlier_detect(df, col, Q1, Q3, IQR)
        if iqr_outliers.shape[0] != 0:
            outlier_cols.append(col)
            print(f"IQR => {col}: {iqr_outliers.shape[0]}")
            print(f"Z_Score => {col}: {outlier_detect_normal(df, col).shape[0]}")
            print("********************************")
    # Clip every flagged column to its fences.
    for col in outlier_cols:
        replace_outliers(df, col, Q1, Q3, IQR)
    print("\n********************************\n")
    for col in outlier_cols:
        print(f"Handling outliers for column: {col}")
    return df
def plot_outliers(df, cols):
    """
    Plot box plots and scatter plots for numerical columns with Z-Score and IQR outliers.
    """
    num_cols = len(cols)
    ncols = 5  # Number of columns for the subplot grid
    nrows = (num_cols + ncols - 1) // ncols  # Compute number of rows needed
    # Figure 1: one box plot per column on an nrows x ncols grid.
    plt.figure(figsize=(15, 5 * nrows))
    for i, col in enumerate(cols):
        plt.subplot(nrows, ncols, i + 1)
        plt.boxplot(df[col].dropna(), vert=False)
        plt.title(f'Box Plot of {col}')
        plt.xlabel(col)
    plt.tight_layout()
    plt.show()
    # Figure 2: each column plotted against the row index, with Z-score
    # outliers (red) and IQR outliers (orange) overlaid for comparison.
    plt.figure(figsize=(15, 5 * nrows))
    for i, col in enumerate(cols):
        plt.subplot(nrows, ncols, i + 1)
        plt.scatter(df.index, df[col], label='Data points', alpha=0.6)
        z_outliers = outlier_detect_normal(df, col)
        # calculate_iqr recomputes the quartiles on every iteration; the
        # (Q1, Q3, IQR) tuple is unpacked straight into outlier_detect.
        iqr_outliers = outlier_detect(df, col, *calculate_iqr(df))
        plt.scatter(z_outliers.index, z_outliers[col], color='red', label='Z-Score Outliers', alpha=0.8)
        plt.scatter(iqr_outliers.index, iqr_outliers[col], color='orange', label='IQR Outliers', alpha=0.6)
        plt.title(col)
        plt.xlabel('Index')
        plt.ylabel(col)
        plt.legend()
        plt.grid(True)
    plt.tight_layout()
    plt.show()
# Process and visualize outliers in the dataset
# process_dataset mutates df1 in place and returns the same frame,
# re-bound here as df2.
df2 = process_dataset(df1)
IQR => Year: 7 Z_Score => Year: 3 ******************************** IQR => Selling_Price: 16 Z_Score => Selling_Price: 8 ******************************** IQR => Present_Price: 13 Z_Score => Present_Price: 5 ******************************** IQR => Kms_Driven: 8 Z_Score => Kms_Driven: 3 ******************************** IQR => Owner: 11 Z_Score => Owner: 11 ******************************** Outliers replaced with bounds for column: Year Outliers replaced with bounds for column: Selling_Price Outliers replaced with bounds for column: Present_Price Outliers replaced with bounds for column: Kms_Driven Outliers replaced with bounds for column: Owner ******************************** Handling outliers for column: Year Handling outliers for column: Selling_Price Handling outliers for column: Present_Price Handling outliers for column: Kms_Driven Handling outliers for column: Owner
# Plot the outliers
# Re-derive the numeric columns after clipping, then draw box/scatter plots.
continuous_cols = df2.select_dtypes(include=np.number).columns.tolist()
plot_outliers(df2, continuous_cols)
continuous_values = []
categorical_values = []
# Partition df2's columns by dtype: int64/float64 count as continuous,
# everything else as categorical.
for column in df2.columns:
    dtype_name = str(df2[column].dtype)
    if dtype_name in ('int64', 'float64'):
        continuous_values.append(column)
    else:
        categorical_values.append(column)
print("Continuous Columns:", continuous_values)
print("Categorical Columns :", categorical_values)
Continuous Columns: ['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven', 'Owner'] Categorical Columns : ['Car_Name', 'Fuel_Type', 'Seller_Type', 'Transmission']
# Create scaler objects
mms = MinMaxScaler()  # For Min-Max scaling (Normalization)
df3 = df2.copy(deep=True)
# Apply scaling using a loop
# Each continuous column is rescaled independently to [0, 1]; the double
# brackets keep the 2-D (n, 1) shape fit_transform expects.
for col in continuous_values:
    df3[col] = mms.fit_transform(df3[[col]])
df3.head()
| Car_Name | Year | Selling_Price | Present_Price | Kms_Driven | Fuel_Type | Seller_Type | Transmission | Owner | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ritz | 0.666667 | 0.241523 | 0.233845 | 0.271852 | Petrol | Dealer | Manual | 0.0 |
| 1 | sx4 | 0.583333 | 0.345564 | 0.409119 | 0.435990 | Diesel | Dealer | Manual | 0.0 |
| 2 | ciaz | 0.916667 | 0.531352 | 0.422874 | 0.065655 | Petrol | Dealer | Manual | 0.0 |
| 3 | wagon r | 0.416667 | 0.204366 | 0.169948 | 0.048215 | Petrol | Dealer | Manual | 0.0 |
| 4 | swift | 0.666667 | 0.334417 | 0.290643 | 0.323145 | Diesel | Dealer | Manual | 0.0 |
# Standardize (zero mean, unit variance) each continuous column.
# Renamed the scaler from `mms` — a misleading leftover that suggests
# MinMaxScaler — to `std_scaler`.
std_scaler = StandardScaler()
df4 = df2.copy(deep=True)
# Double brackets keep the 2-D (n, 1) shape fit_transform expects.
for col in continuous_values:
    df4[col] = std_scaler.fit_transform(df4[[col]])
df4.head()
| Car_Name | Year | Selling_Price | Present_Price | Kms_Driven | Fuel_Type | Seller_Type | Transmission | Owner | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | ritz | 0.124223 | -0.239678 | -0.237779 | -0.308272 | Petrol | Dealer | Manual | 0.0 |
| 1 | sx4 | -0.237592 | 0.140648 | 0.412316 | 0.369728 | Diesel | Dealer | Manual | 0.0 |
| 2 | ciaz | 1.209671 | 0.819803 | 0.463336 | -1.160010 | Petrol | Dealer | Manual | 0.0 |
| 3 | wagon r | -0.961224 | -0.375509 | -0.474776 | -1.232047 | Petrol | Dealer | Manual | 0.0 |
| 4 | swift | 0.124223 | 0.099899 | -0.027115 | -0.096397 | Diesel | Dealer | Manual | 0.0 |
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
%matplotlib inline
df = pd.read_csv(r"C:\Users\acer\Downloads\cardata.csv")
def plot_correlation_matrix(df, threshold):
    """
    Plot the lower-triangle correlation heatmap of the numeric columns and
    return the features involved in at least one |corr| > threshold pair.

    Returns:
        (correlated_features, num_features): de-duplicated list of column
        names and its length.
    """
    # Select only numeric columns to avoid warning
    numeric_df = df.select_dtypes(include=[np.number])
    corr_matrix = numeric_df.corr()
    # Plot the correlation matrix; the mask hides the redundant upper triangle.
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1,
                mask=np.triu(np.ones_like(corr_matrix, dtype=bool)))
    plt.title('Correlation Matrix')
    plt.show()
    # Collect every feature strongly correlated with some *other* feature.
    # (Replaces the original's expression-statement idiom
    #  `high_corr.remove(column) if column in high_corr else None`; a set
    #  also subsumes the manual list(set(...)) de-duplication.)
    correlated = set()
    for column in corr_matrix.columns:
        high_corr = corr_matrix.index[abs(corr_matrix[column]) > threshold].tolist()
        correlated.update(c for c in high_corr if c != column)
    correlated_features = list(correlated)
    num_features = len(correlated_features)
    return correlated_features, num_features
# Find and report the features with |correlation| above 0.6.
correlated_features, num_features = plot_correlation_matrix(df, threshold=0.6)
print(f"\nHighly correlated features (threshold > 0.6): {correlated_features} ({num_features} features)")
# Visualize the correlated features using pair plots
if num_features > 1:
    sns.pairplot(df[correlated_features])
    plt.suptitle('\n\nPair Plot of Highly Correlated Features', size=20)
    plt.show()
    # Visualize the correlated features using scatter plots
    # The i < j guard plots each unordered pair exactly once.
    for i, feature in enumerate(correlated_features):
        for j, other_feature in enumerate(correlated_features):
            if i < j:
                plt.figure(figsize=(8, 6))
                sns.scatterplot(x=df[feature], y=df[other_feature])
                plt.title(f'Scatter Plot of {feature} vs {other_feature}')
                plt.xlabel(feature)
                plt.ylabel(other_feature)
                plt.show()
else:
    print("No highly correlated features found above the threshold.")
Highly correlated features (threshold > 0.6): ['Present_Price', 'Kms_Driven', 'Seller_Type'] (3 features)
# Build the feature matrix / target for mutual-information ranking.
X = df1.drop(columns=['Owner', 'Selling_Price'])  # Exclude 'Owner' and 'Selling_Price'
y = df1['Selling_Price']
# Calculate mutual information for regression
# NOTE(review): mutual_info_regression requires a numeric feature matrix;
# this assumes df1's categorical columns (Car_Name, Fuel_Type, ...) were
# label-encoded in an earlier step — confirm against the preprocessing cells.
mutual_info = mutual_info_regression(X, y)
# Create a DataFrame for feature importance
mutual_info_df = pd.DataFrame({'Feature': X.columns, 'Mutual Information': mutual_info})
# Sort features from most to least informative about the target.
mutual_info_df.sort_values(by='Mutual Information', ascending=False, inplace=True)
# Display the mutual information DataFrame in a table
print(mutual_info_df)
# Visualize the table using matplotlib
fig, ax = plt.subplots(figsize=(12, 8))  # Set the figure size
ax.axis('off')  # Hide the axes
# Create a table plot
table = ax.table(cellText=mutual_info_df.values, colLabels=mutual_info_df.columns, cellLoc='center', loc='center')
table.auto_set_font_size(False)
table.set_fontsize(12)
table.scale(1.2, 1.2)
plt.title('Mutual Information Table', fontsize=16)
plt.show()
Feature Mutual Information 2 Present_Price 1.253503 0 Car_Name 0.972751 5 Seller_Type 0.575705 1 Year 0.265208 4 Fuel_Type 0.166297 3 Kms_Driven 0.129226 6 Transmission 0.122107
# Bar plot for feature importance
# NOTE(review): seaborn deprecates `palette` without `hue` in newer versions;
# works today but may warn — confirm the pinned seaborn version.
plt.figure(figsize=(9, 7))
sns.barplot(x='Mutual Information', y='Feature', data=mutual_info_df, palette='viridis')
plt.title('Feature Importance based on Mutual Information')
plt.xlabel('Mutual Information')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
# Histogram for mutual information distribution
plt.figure(figsize=(7, 5))
sns.histplot(mutual_info_df['Mutual Information'], kde=True, color='skyblue')
plt.title('Mutual Information Distribution')
plt.xlabel('Mutual Information')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Scatter plot for feature importance vs. mutual information
plt.figure(figsize=(7, 5))
sns.scatterplot(x='Mutual Information', y='Feature', data=mutual_info_df, hue='Mutual Information', palette='coolwarm', s=100)
plt.title('Feature Importance vs. Mutual Information')
plt.xlabel('Mutual Information')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()
# In this question, I did exploratory data analysis using 12 different kinds of plots.
import pandas as pd
from ydata_profiling import ProfileReport
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
df = pd.read_csv(r"C:\Users\acer\Downloads\cardata.csv")
# 1. Histograms (with KDE overlay) for every numeric column.
for column in df.select_dtypes(include=['float64', 'int64']).columns:
    plt.figure(figsize=(7, 5))
    sns.histplot(df[column], kde=True)
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
    plt.show()
# 2. Pie charts of the value distribution for each categorical column.
# Define categorical columns
categorical_columns = ['Fuel_Type', 'Seller_Type', 'Transmission']
# Plot pie charts for categorical columns
for column in categorical_columns:
    plt.figure(figsize=(7, 5))
    # One palette color per distinct category value.
    colors = sns.color_palette('Set3', len(df[column].unique()))
    df[column].value_counts().plot.pie(
        autopct='%1.1f%%',
        colors=colors,
        startangle=90,
        wedgeprops={'alpha': 0.8},
        textprops={'color': "black"}
    )
    plt.title(f'Distribution of {column}', fontsize=16)
    plt.ylabel('')  # Hide the default "count" axis label.
    plt.show()
# 3. Scatter Plot for Relationships Between Two Numerical Variables
plt.figure(figsize=(7, 5))
sns.scatterplot(x='Present_Price', y='Selling_Price', hue='Fuel_Type', data=df, palette='viridis')
plt.title('Scatter Plot of Present Price vs. Selling Price')
plt.xlabel('Present Price')
plt.ylabel('Selling Price')
plt.show()
# 4. Heatmap for Correlation Matrix (numeric columns only)
plt.figure(figsize=(12, 8))
corr_matrix = df.select_dtypes(include=['float64', 'int64']).corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix')
plt.show()
# 5. Bar Plot for Categorical vs. Numerical (mean selling price per fuel type)
plt.figure(figsize=(7, 5))
sns.barplot(x='Fuel_Type', y='Selling_Price', data=df, palette='viridis')
plt.title('Bar Plot of Selling Price by Fuel Type')
plt.xlabel('Fuel Type')
plt.ylabel('Selling Price')
plt.show()
# 6. Line Plot for Trends (selling price over model year)
plt.figure(figsize=(7, 5))
sns.lineplot(x='Year', y='Selling_Price', data=df, marker='o')
plt.title('Line Plot of Selling Price over Years')
plt.xlabel('Year')
plt.ylabel('Selling Price')
plt.show()
# 7. Joint Plot for Detailed Relationships (scatter + marginal distributions)
sns.jointplot(x='Present_Price', y='Selling_Price', data=df, hue='Fuel_Type', palette='viridis')
plt.suptitle('Joint Plot of Present Price vs. Selling Price', y=1.02)
plt.show()
# 8. 3D Scatter Plot of three numeric features, colored by model year.
fig = plt.figure(figsize=(10, 7))
ax = fig.add_subplot(111, projection='3d')
# Plotting the scatter plot
sc = ax.scatter(df['Present_Price'], df['Kms_Driven'], df['Selling_Price'],
                c=df['Year'], cmap='viridis', s=50, alpha=0.8, edgecolors='w', linewidth=0.5)
# Adding labels and title
ax.set_xlabel('Present Price', fontsize=12)
ax.set_ylabel('Kms Driven', fontsize=12)
ax.set_zlabel('Selling Price', fontsize=12)
ax.set_title('3D Scatter Plot of Present Price, Kms Driven, and Selling Price', fontsize=14)
# Adding a color bar
cbar = plt.colorbar(sc, ax=ax, pad=0.1)
cbar.set_label('Year', rotation=270, labelpad=15)
# Adding grid for better readability
ax.grid(True)
# Setting a specific view angle
ax.view_init(elev=20., azim=30)
# Show plot
plt.show()
# 9. Scatter Regression Plot (scatter with fitted regression line)
plt.figure(figsize=(7, 5))
sns.regplot(x='Present_Price', y='Selling_Price', data=df, scatter_kws={'color': 'blue'}, line_kws={'color': 'yellow'})
plt.title('Regression Plot of Present Price vs Selling Price')
plt.show()
# 10. Bubble Plot (marker size encodes Kms_Driven, color encodes Year)
plt.figure(figsize=(7, 5))
sns.scatterplot(x='Present_Price', y='Selling_Price', size='Kms_Driven', hue='Year', data=df, alpha=0.9, sizes=(20, 200), palette='viridis')
plt.title('Bubble Plot of Present Price vs Selling Price with Kms Driven')
plt.show()
# 11. Multi-Feature Pair Plot (KDE on the diagonal, colored by fuel type)
sns.pairplot(df, hue='Fuel_Type', palette='coolwarm', diag_kind='kde', height=3, aspect=1.3, plot_kws={'alpha': 0.9, 's': 50, 'edgecolor': 'w', 'linewidth': 0.5})
plt.show()
# 12. Facet Grid Plot (one panel per fuel type, colored by transmission)
g = sns.FacetGrid(df, col='Fuel_Type', hue='Transmission', height=5, aspect=1.2, palette='Set1', col_wrap=2)
g.map(sns.scatterplot, 'Present_Price', 'Selling_Price', s=100, alpha=0.3, edgecolor='w', linewidth=0.5)
g.add_legend()
g.set_axis_labels('Present Price', 'Selling Price')
g.fig.suptitle('Facet Grid Plot of Present Price vs. Selling Price by Fuel Type and Transmission', y=1.05)
# Additional customizations: remove the top/right spines on every panel.
for ax in g.axes.flat:
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
plt.show()
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv("C:\\Users\\acer\\Downloads\\data.csv")
# Preview the first five rows of the breast-cancer dataset.
df.head()
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | ... | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | Unnamed: 32 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 842302 | M | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | ... | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 | NaN |
| 1 | 842517 | M | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | ... | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 | NaN |
| 2 | 84300903 | M | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | ... | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 | NaN |
| 3 | 84348301 | M | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | ... | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 | NaN |
| 4 | 84358402 | M | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | ... | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 | NaN |
5 rows × 33 columns
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 33 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 569 non-null int64 1 diagnosis 569 non-null object 2 radius_mean 569 non-null float64 3 texture_mean 569 non-null float64 4 perimeter_mean 569 non-null float64 5 area_mean 569 non-null float64 6 smoothness_mean 569 non-null float64 7 compactness_mean 569 non-null float64 8 concavity_mean 569 non-null float64 9 concave points_mean 569 non-null float64 10 symmetry_mean 569 non-null float64 11 fractal_dimension_mean 569 non-null float64 12 radius_se 569 non-null float64 13 texture_se 569 non-null float64 14 perimeter_se 569 non-null float64 15 area_se 569 non-null float64 16 smoothness_se 569 non-null float64 17 compactness_se 569 non-null float64 18 concavity_se 569 non-null float64 19 concave points_se 569 non-null float64 20 symmetry_se 569 non-null float64 21 fractal_dimension_se 569 non-null float64 22 radius_worst 569 non-null float64 23 texture_worst 569 non-null float64 24 perimeter_worst 569 non-null float64 25 area_worst 569 non-null float64 26 smoothness_worst 569 non-null float64 27 compactness_worst 569 non-null float64 28 concavity_worst 569 non-null float64 29 concave points_worst 569 non-null float64 30 symmetry_worst 569 non-null float64 31 fractal_dimension_worst 569 non-null float64 32 Unnamed: 32 0 non-null float64 dtypes: float64(31), int64(1), object(1) memory usage: 146.8+ KB
df.shape
(569, 33)
# Drop the all-NaN padding column and the non-numeric label so only
# numeric features remain for scaling and PCA.
df.drop("Unnamed: 32", axis=1, inplace=True)
df.drop("diagnosis", axis=1, inplace=True)
df.shape
(569, 31)
# Standardize every remaining column to zero mean / unit variance so no
# feature dominates the PCA by scale alone.
# NOTE(review): the `id` column is still present and gets scaled into the
# PCA input — it carries no signal; consider dropping it first.
scaler = StandardScaler()
scaled_df = scaler.fit_transform(df)
# Despite the name, df_pca holds the *scaled* features, not PCA components.
df_pca = pd.DataFrame(scaled_df, columns=df.columns)
df_pca.head()
| id | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | concave points_mean | symmetry_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | concave points_worst | symmetry_worst | fractal_dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.236405 | 1.097064 | -2.073335 | 1.269934 | 0.984375 | 1.568466 | 3.283515 | 2.652874 | 2.532475 | 2.217515 | ... | 1.886690 | -1.359293 | 2.303601 | 2.001237 | 1.307686 | 2.616665 | 2.109526 | 2.296076 | 2.750622 | 1.937015 |
| 1 | -0.236403 | 1.829821 | -0.353632 | 1.685955 | 1.908708 | -0.826962 | -0.487072 | -0.023846 | 0.548144 | 0.001392 | ... | 1.805927 | -0.369203 | 1.535126 | 1.890489 | -0.375612 | -0.430444 | -0.146749 | 1.087084 | -0.243890 | 0.281190 |
| 2 | 0.431741 | 1.579888 | 0.456187 | 1.566503 | 1.558884 | 0.942210 | 1.052926 | 1.363478 | 2.037231 | 0.939685 | ... | 1.511870 | -0.023974 | 1.347475 | 1.456285 | 0.527407 | 1.082932 | 0.854974 | 1.955000 | 1.152255 | 0.201391 |
| 3 | 0.432121 | -0.768909 | 0.253732 | -0.592687 | -0.764464 | 3.283553 | 3.402909 | 1.915897 | 1.451707 | 2.867383 | ... | -0.281464 | 0.133984 | -0.249939 | -0.550021 | 3.394275 | 3.893397 | 1.989588 | 2.175786 | 6.046041 | 4.935010 |
| 4 | 0.432201 | 1.750297 | -1.151816 | 1.776573 | 1.826229 | 0.280372 | 0.539340 | 1.371011 | 1.428493 | -0.009560 | ... | 1.298575 | -1.466770 | 1.338539 | 1.220724 | 0.220556 | -0.313395 | 0.613179 | 0.729259 | -0.868353 | -0.397100 |
5 rows × 31 columns
scaled_df
array([[-0.23640517, 1.09706398, -2.07333501, ..., 2.29607613,
2.75062224, 1.93701461],
[-0.23640344, 1.82982061, -0.35363241, ..., 1.0870843 ,
-0.24388967, 0.28118999],
[ 0.43174109, 1.57988811, 0.45618695, ..., 1.95500035,
1.152255 , 0.20139121],
...,
[-0.23572747, 0.70228425, 2.0455738 , ..., 0.41406869,
-1.10454895, -0.31840916],
[-0.23572517, 1.83834103, 2.33645719, ..., 2.28998549,
1.91908301, 2.21963528],
[-0.24240586, -1.80840125, 1.22179204, ..., -1.74506282,
-0.04813821, -0.75120669]])
PCA (Principal Component Analysis) projects the standardized features onto orthogonal directions ordered by explained variance; it is applied below:
# Perform PCA, keeping all components (no dimensionality cut yet).
pca = PCA()
pca_components = pca.fit_transform(scaled_df)
# Label the component columns PC1..PCk for readability.
component_names = [f"PC{i+1}" for i in range(pca_components.shape[1])]
pca_df = pd.DataFrame(data=pca_components, columns=component_names)
pca_df.head()
| PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | PC10 | ... | PC22 | PC23 | PC24 | PC25 | PC26 | PC27 | PC28 | PC29 | PC30 | PC31 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 9.183200 | 1.971271 | -1.171625 | -3.639332 | 1.193098 | 1.372280 | 0.371844 | 2.180529 | -0.231610 | -0.090396 | ... | -0.107371 | -0.069634 | -0.085232 | -0.175628 | -0.150774 | 0.200807 | -0.253193 | 0.033911 | -0.045572 | 0.047166 |
| 1 | 2.383298 | -3.753459 | -0.580229 | -1.127438 | -0.624850 | 0.126616 | -0.288270 | 0.044935 | 0.426916 | -0.659939 | ... | 0.075191 | 0.091740 | 0.213925 | 0.010368 | -0.170210 | 0.042420 | 0.180649 | -0.032630 | 0.005902 | 0.001845 |
| 2 | 5.742472 | -1.080350 | -0.533088 | -0.903470 | 0.180601 | 0.401855 | 0.463188 | -0.715209 | -0.010712 | -0.082305 | ... | -0.303285 | 0.058930 | 0.074145 | 0.103834 | 0.170749 | -0.005066 | 0.049887 | -0.047022 | -0.003290 | -0.000735 |
| 3 | 7.124384 | 10.272225 | -3.150161 | -0.121512 | 2.968055 | 2.561668 | 1.950177 | 1.287990 | 1.272638 | -1.171376 | ... | -0.410865 | 0.205120 | 0.135322 | 0.158659 | 0.075684 | 0.273048 | 0.184350 | -0.042465 | 0.068640 | 0.020001 |
| 4 | 3.945694 | -1.959689 | 1.401177 | -2.937555 | -0.540853 | -1.233300 | -0.205435 | -0.959329 | 0.629119 | -0.166354 | ... | 0.117341 | 0.020423 | -0.135431 | -0.004874 | 0.002884 | -0.039637 | 0.032586 | 0.034790 | -0.005183 | -0.021199 |
5 rows × 31 columns
# Print the number of components and the explained variance ratio
num_components = pca.n_components_
explained_variance_ratio = pca.explained_variance_ratio_
# Create DataFrame for explained variance ratio
# One row per component, labelled PC1..PCk in fitted (descending-variance) order.
explained_variance_df = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(num_components)],
    'Explained Variance Ratio': explained_variance_ratio
})
print("Number of Principal Components:", num_components)
print("\nExplained Variance Ratio of each Principal Component:")
explained_variance_df.head(7)
Number of Principal Components: 31 Explained Variance Ratio of each Principal Component:
| Principal Component | Explained Variance Ratio | |
|---|---|---|
| 0 | PC1 | 0.428647 |
| 1 | PC2 | 0.183768 |
| 2 | PC3 | 0.091464 |
| 3 | PC4 | 0.063915 |
| 4 | PC5 | 0.053188 |
| 5 | PC6 | 0.039828 |
| 6 | PC7 | 0.031557 |
# Cumulative Explained Variance Ratio
# Running sum over components — shows how many PCs are needed to reach a
# target fraction of total variance.
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
cumulative_variance_df = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(num_components)],
    'Explained Variance Ratio': explained_variance_ratio,
    'Cumulative Explained Variance': cumulative_explained_variance
})
print("\nCumulative Explained Variance Ratio:")
cumulative_variance_df.head(7)
Cumulative Explained Variance Ratio:
| Principal Component | Explained Variance Ratio | Cumulative Explained Variance | |
|---|---|---|---|
| 0 | PC1 | 0.428647 | 0.428647 |
| 1 | PC2 | 0.183768 | 0.612415 |
| 2 | PC3 | 0.091464 | 0.703879 |
| 3 | PC4 | 0.063915 | 0.767794 |
| 4 | PC5 | 0.053188 | 0.820982 |
| 5 | PC6 | 0.039828 | 0.860810 |
| 6 | PC7 | 0.031557 | 0.892367 |
# Principal Component Variance Explained Plot (bar chart, one bar per PC)
plt.figure(figsize=(11, 4))
plt.bar(range(1, num_components + 1), explained_variance_ratio, color='skyblue')
plt.title('Variance Explained by Each Principal Component')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.xticks(range(1, num_components + 1))
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()
# Scree Plot (same data as a line; the "elbow" suggests how many PCs to keep)
# NOTE(review): yellow on a white background is hard to read — consider a
# darker line color.
plt.figure(figsize=(11, 4))
plt.plot(range(1, num_components + 1), explained_variance_ratio, marker='o', linestyle='--', color='yellow')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid(True)
plt.xticks(range(1, num_components + 1))
plt.show()
# Cumulative Explained Variance Plot
plt.figure(figsize=(11, 4))
plt.plot(range(1, num_components + 1), cumulative_explained_variance, marker='o', linestyle='-', color='r')
plt.title('Cumulative Explained Variance')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Explained Variance')
plt.grid(True)
plt.xticks(range(1, num_components + 1))
plt.show()
# Combined Scree Plot (per-component and cumulative curves on one axes)
plt.figure(figsize=(11, 4))
plt.plot(range(1, num_components + 1), explained_variance_ratio, marker='o', linestyle='--', color='yellow', label='Explained Variance')
plt.plot(range(1, num_components + 1), cumulative_explained_variance, marker='o', linestyle='-', color='r', label='Cumulative Explained Variance')
plt.title('Explained and Cumulative Variance')
plt.xlabel('Principal Component')
plt.ylabel('Variance')
plt.legend()
plt.grid(True)
plt.show()
# Combine PC1 and PC2 for the color gradient
pca_df['Color'] = np.sqrt(pca_df['PC1']**2 + pca_df['PC2']**2)  # Magnitude of the vector
# Define a colormap for the gradient
cmap = plt.get_cmap('viridis')
# Create the scatter plot with gradient colors: each point colored by its
# Euclidean distance from the origin in the (PC1, PC2) plane.
plt.figure(figsize=(11, 6))
scatter = plt.scatter(
    pca_df['PC1'], pca_df['PC2'],
    c=pca_df['Color'], cmap=cmap,
    alpha=0.8, edgecolors='k', s=100
)
plt.colorbar(scatter, label='Combined PC1 and PC2 Value')
plt.xlabel('First Principal Component (PC1)')
plt.ylabel('Second Principal Component (PC2)')
plt.title('Scatter Plot of PC1 vs. PC2 with Combined Color Gradient')
plt.grid(True)
plt.show()
# Calculate the magnitude for color gradient (distance from the origin in
# the PC1/PC2/PC3 subspace).
pca_df['Magnitude'] = np.sqrt(pca_df['PC1']**2 + pca_df['PC2']**2 + pca_df['PC3']**2)
# Define a colormap
cmap = plt.get_cmap('coolwarm')
# Create 3D scatter plot of the first three principal components.
fig = plt.figure(figsize=(12, 7))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    pca_df['PC1'], pca_df['PC2'], pca_df['PC3'],
    c=pca_df['Magnitude'], cmap=cmap, edgecolor='k', s=50
)
fig.colorbar(scatter, ax=ax, label='Magnitude of PC1, PC2, and PC3')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
ax.set_title('3D Scatter Plot of First Three Principal Components with Magnitude Color Gradient')
plt.show()
explained_variance = pca.explained_variance_ratio_
# Get the indices of the best (most significant) and worst (least significant) components based on explained variance
# NOTE: PCA orders components by decreasing explained variance, so argmax is
# always 0 and argmin is always the last index; kept for explicitness.
best_index = np.argmax(explained_variance)  # Index of the component with the highest explained variance
worst_index = np.argmin(explained_variance)  # Index of the component with the lowest explained variance
# Plot Loading Scores of Best and Worst Components
# Each loading score is the weight of one original feature in the component.
plt.figure(figsize=(9, 5))
plt.plot(pca.components_[best_index, :], label=f'Best Component (PC{best_index+1}) - Variance: {explained_variance[best_index]:.2f}')
plt.plot(pca.components_[worst_index, :], label=f'Worst Component (PC{worst_index+1}) - Variance: {explained_variance[worst_index]:.2f}')
plt.title('Loading Scores of Best and Worst Principal Components')
plt.xlabel('Feature Index')
plt.ylabel('Loading Score')
plt.legend(loc='best')
plt.grid(True)
plt.show()